# download_openpraxis.py
# Open Praxis Downloader
# Automates downloading PDFs from Open Praxis (Ubiquity Press platform)
# - Crawls issue TOC for article URLs
# - Extracts PDF URLs via metadata from article pages
# - Skips Book Reviews and filtered Editorials
# - Creates unique folders with Volume, Issue, Year

"""
Features:
- Downloads all article PDFs from a given Open Praxis issue.
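- Skips any article whose title contains "Book Review" (case-insensitive).
- Skips Editorial articles whose title starts with (case-insensitive):
  "Brief report", "Introduction to", or "Editorial Open Praxis".
  Titles are compared after filename-illegal characters such as ":" are
  removed, so "Editorial: Open Praxis ..." also matches.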
- Filenames use only the article title.
- Creates a folder based on Volume, Issue, and Year from the issue <title>.

Usage:
1. Run the script.
2. Enter the Open Praxis issue URL (e.g., https://openpraxis.org/10/volume/12/issue/1).
3. PDFs are saved into a folder like: OpenPraxis_Vol12_Issue1_2021
"""


import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

HEADERS = {"User-Agent": "Mozilla/5.0"}

def sanitize(text):
    """Return *text* made safe for use as a file or folder name.

    Removes the characters that are reserved in Windows file names
    (backslash, slash, asterisk, question mark, colon, double quote,
    angle brackets and pipe), then turns every run of whitespace or
    control characters (newlines, tabs, NUL, ...) into a single space,
    and finally trims leading/trailing whitespace.

    Args:
        text: Arbitrary string, e.g. an article title scraped from HTML.

    Returns:
        The cleaned string; may be empty if nothing usable remains.
    """
    without_reserved = re.sub(r'[\\/*?:"<>|]', "", text)
    # Scraped titles can contain line breaks or tabs, which are not valid in
    # file names on Windows; collapsing also tidies the double spaces left
    # behind when a reserved character sat between two spaces.
    collapsed = re.sub(r"[\s\x00-\x1f\x7f]+", " ", without_reserved)
    return collapsed.strip()

# Seconds to wait for any single HTTP request before giving up; without a
# timeout a stalled connection would hang the script indefinitely.
REQUEST_TIMEOUT = 30

# Lower-cased title prefixes that mark editorial items to skip.
SKIPPED_TITLE_PREFIXES = (
    "brief report",
    "introduction to",
    "editorial open praxis",
)

issue_url = input("Enter Open Praxis issue URL: ").strip()

# Fetch issue page. An HTTP error (or an unusable URL) ends the run with a
# clear message instead of scraping an error page as if it were the TOC.
try:
    resp = requests.get(issue_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
except requests.RequestException as exc:
    raise SystemExit(f"[ERROR] Could not fetch issue page {issue_url}: {exc}")
soup = BeautifulSoup(resp.text, "html.parser")

parsed = urlparse(issue_url)
base_url = f"{parsed.scheme}://{parsed.netloc}"

# --- Extract Volume, Issue, Year from <title> ---
# Placeholders are kept when the <title> is missing or does not match the
# expected "Volume N - Issue M - YYYY" pattern.
volume, issue, year = "Vol", "Issue", "Year"
issue_title_tag = soup.find("title")
if issue_title_tag:
    title_text = issue_title_tag.get_text(strip=True)
    match = re.search(r"Volume\s+(\d+)\s*-\s*Issue\s+(\d+)\s*-\s*(\d{4})", title_text)
    if match:
        volume, issue, year = match.groups()

folder = sanitize(f"OpenPraxis_Vol{volume}_Issue{issue}_{year}")
os.makedirs(folder, exist_ok=True)

# 1️⃣ Collect all article links from the issue page
# (order of first appearance is preserved; the set only speeds up de-duplication)
article_links = []
seen_links = set()
for a in soup.find_all("a", href=True):
    href = a["href"]
    if "/articles/" in href and "/files/" not in href:
        full_url = urljoin(base_url, href)
        if full_url not in seen_links:
            seen_links.add(full_url)
            article_links.append(full_url)

print(f"Found {len(article_links)} article pages")

count = 0

# 2️⃣ Visit each article page to extract PDF
for url in article_links:
    # A failure on one article page must not abort the remaining downloads.
    try:
        r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] Fetching article page {url}: {e}")
        continue
    art_soup = BeautifulSoup(r.text, "html.parser")

    pdf_meta = art_soup.find("meta", {"name": "citation_pdf_url"})
    title_meta = art_soup.find("meta", {"name": "citation_title"})

    if not pdf_meta or not title_meta:
        continue

    # .get() tolerates a <meta> tag that lacks a content attribute.
    pdf_href = pdf_meta.get("content")
    title = sanitize(title_meta.get("content") or "")
    if not pdf_href or not title:
        print(f"[SKIP] Missing PDF URL or title: {url}")
        continue

    # Resolve against the article URL in case the metadata holds a relative
    # link; an absolute URL is returned unchanged by urljoin.
    pdf_url = urljoin(url, pdf_href)
    lower_title = title.lower()

    # Apply skip rules based on title
    if "book review" in lower_title:
        print(f"[SKIP] Book Review: {title}")
        continue

    if lower_title.startswith(SKIPPED_TITLE_PREFIXES):
        print(f"[SKIP] Editorial filtered: {title}")
        continue

    filename = f"{title}.pdf"
    path = os.path.join(folder, filename)

    print(f"[{count+1}] Downloading: {filename}")
    try:
        pdf = requests.get(pdf_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        pdf.raise_for_status()
        # Header values are not case-normalized, so compare in lower case.
        if "application/pdf" not in pdf.headers.get("Content-Type", "").lower():
            print(f"[SKIP] Not a PDF: {title}")
            continue
        with open(path, "wb") as f:
            f.write(pdf.content)
        count += 1
        print(f"[OK] Saved: {filename}")
    except (requests.RequestException, OSError) as e:
        # Network problems and file-system errors (e.g. an over-long file
        # name) are reported and the loop moves on to the next article.
        print(f"[ERROR] Downloading {title}: {e}")

print(f"\nDone! {count} PDFs saved in {folder}")
